## Start from an empty workspace: the script ends with save.image(), which
## writes every object in the global environment to disk.
rm(list = ls())
library(foreign)

### Read the raw Stata files (paths are relative to the working directory)
cand.raw <- read.dta(file = "../data/candidates.dta")
party.raw <- read.dta(file = "../data/parties.dta")

### Basic merge/cleaning

## Put the candidate rows into analysis order, keyed on COUNTRY, REGION,
## PARTY and LISTNUM (in that priority).  The sort runs before the rename
## below, so the keys are still referenced by their upper-case names.
cand.raw <- cand.raw[
  order(
    cand.raw[["COUNTRY"]],
    cand.raw[["REGION"]],
    cand.raw[["PARTY"]],
    cand.raw[["LISTNUM"]]
  ),
]

## From here on every candidate column is referred to in lower case
colnames(cand.raw) <- tolower(colnames(cand.raw))

## Drop candidates for which we lack party info.
##
## has.party is TRUE for candidate rows whose pirdeucode appears in
## party.raw$party_code.  %in% never returns NA, so a missing code on either
## side can no longer abort the script inside an if(); candidates with a
## missing code are treated as having no party info and are dropped.
has.party <- !is.na(cand.raw$pirdeucode) &
  cand.raw$pirdeucode %in% party.raw$party_code

## Log each party being dropped.  The row numbers printed refer to cand.raw
## as it stands before any rows are removed.
for (pc in unique(cand.raw$pirdeucode[!has.party])) {
  todrop <- which(cand.raw$pirdeucode %in% pc)
  print(paste("Dropping party", pc,
    "because we have no party info.  This includes", length(todrop),
    "candidates, rows:"))
  print(todrop)
}

## Remove all unmatched candidates in a single subsetting step
cand.raw <- cand.raw[has.party, ]

## Party codes that remain in the candidate data
cand.parties <- unique(cand.raw$pirdeucode)

## Check for party codes that occur more than once in the party data and are
## used by at least one candidate; each such code is printed.  (The original
## author noted there are none.)  A duplicate would matter later on, because
## party_code is assigned as the row names of party.raw and data-frame row
## names must be unique.
dup.codes <- party.raw$party_code[duplicated(party.raw$party_code)]
for (pc in cand.parties[cand.parties %in% dup.codes]) print(pc)

## Drop parties that don't show up in the candidate data.
## party.select is a plain logical vector with one entry per row of party.raw;
## %in% always yields TRUE/FALSE (never NA, never a list), so the subsetting
## below is well defined even for an empty party table or missing codes.
party.select <- party.raw$party_code %in% cand.parties
party.raw <- party.raw[party.select, ]


##  add party info to candidate stuff

## Keep party_code as the row names of party.raw so the table can still be
## indexed by code elsewhere.
rownames(party.raw) <- party.raw$party_code

## For each candidate, the row of party.raw holding its party (NA if none).
## match() compares the codes exactly; indexing a data frame by character row
## name would also accept unique partial matches, which is not wanted here.
party.row <- match(
  as.character(cand.raw$pirdeucode),
  as.character(party.raw$party_code)
)

### Party context
cand.raw$ingov <- party.raw[party.row, "v1_12"]
cand.raw$propseats <- party.raw[party.row, "propseats"]

### ideology
cand.raw$pro_anti_eu <- party.raw[party.row, "pro_anti_eu"]

### emphasis
## emph.europe is the element-wise sum of the eight per_v*_108 and
## per_v*_110 columns, added left to right; an NA in any of them gives NA.
europe.cols <- c(
  "per_v1_108", "per_v2_108", "per_v3_108", "per_v4_108",
  "per_v1_110", "per_v2_110", "per_v3_110", "per_v4_110"
)
party.raw$emph.europe <- Reduce(`+`, party.raw[europe.cols])
cand.raw$emph.europe <- party.raw[party.row, "emph.europe"]

### Group
cand.raw$group <- party.raw[party.row, "v1_111stgroup"]

## Fix some missing groups.  fix.codes[k] is a party code and fix.groups[k]
## the group number assigned to every candidate of that party.
fix.codes <- c(
  1348442, # Hungarian SZDSZ is ALDE
  1203321, # Czech SNK ED is EPP
  1372951, # Sinn Fein is GUE/NGL
  1372110, # Irish greens are green
  1203110, # Czech Greens (SZ)
  1203021  # Czech Nezavisli I/D
)
fix.groups <- c(3, 1, 6, 4, 4, 8)
for (k in seq_along(fix.codes)) {
  cand.raw$group[cand.raw$pirdeucode == fix.codes[k]] <- fix.groups[k]
}

## Code big three: 1 when the group number is below 4, otherwise 0.
## A missing comparison result counts as 0, and the column is numeric.
below4 <- cand.raw$group < 4
cand.raw$big3 <- as.numeric(!is.na(below4) & below4)

## Selection centralization
## Look the party row up with an exact match on the party code; indexing a
## data frame by character row name would also accept unique partial matches.
cand.raw$natselect2 <- party.raw[
  match(as.character(cand.raw$pirdeucode), as.character(party.raw$party_code)),
  "v043_2avg"
]

### Some bookkeeping and error correction

## Renumber the rows 1..n after the sorting and dropping done above.
## seq_len() also copes with an empty data frame, where 1:nrow() would
## produce c(1, 0).
rownames(cand.raw) <- seq_len(nrow(cand.raw))

## Custom fix for list 1642401 NATION.  Sets listnum 2 to 1.
## which() keeps only rows where all three conditions are TRUE, so a row with
## a missing value in one of the fields is skipped rather than stopping the
## script with "missing value where TRUE/FALSE needed".
fix.rows <- which(
  cand.raw$pirdeucode == "1642401" &
    cand.raw$region == "NATION" &
    cand.raw$listnum == 2
)
if (length(fix.rows) > 0) {
  cand.raw$listnum[fix.rows] <- 1
}

### Set up the dv
## gamma is a 0/1 indicator matrix with one row per candidate (row names are
## the candid values) and four columns:
##   non - neither the "reg" nor the "nat" indicator is set
##   reg - heldlocalelectpos == 1 or heldregionalelectpos == 1
##   nat - heldnationalelectpos == 1
##   inc - incumbent == 1
## Rows with a missing value in a source column are left at 0 for that column.
gamma <- matrix(
  0,
  nrow = nrow(cand.raw),
  ncol = 4,
  dimnames = list(NULL, c("non", "reg", "nat", "inc"))
)
gamma[which(cand.raw$heldlocalelectpos == 1), "reg"] <- 1
gamma[which(cand.raw$heldregionalelectpos == 1), "reg"] <- 1
gamma[which(cand.raw$heldnationalelectpos == 1), "nat"] <- 1
## NOTE(review): "non" is filled in before "inc", so an incumbent with no
## local/regional/national office gets non = 1 and inc = 1 -- confirm this
## is intended.
gamma[rowSums(gamma) == 0, "non"] <- 1
gamma[which(cand.raw$incumbent == 1), "inc"] <- 1
rownames(gamma) <- cand.raw$candid

## Write the entire global workspace to disk -- not just cand.raw, party.raw
## and gamma, but every other object created above as well.  Presumably
## loaded by the later analysis scripts; verify before removing any object.
save.image(file="../data/merged.RData")
